library(Seurat)
library(dplyr)
library(Matrix)
library(cowplot)
library(ggplot2)
SC33 is the control while SC14 and SC15 are TiO2 and Asbestos respectively. Below, we modify cell names (barcodes) so they are all unique between samples. Here, we create a merged Seurat Object to identify clusters without using CCA.
SC33.data <- Read10X(data.dir = "~/Box/Asbestos_SC/03_Analysis_V2/00_Asbestos/01_data/SC33_multirun/filtered_gene_bc_matrices/mm10/")
SC33 <- CreateSeuratObject(raw.data = SC33.data, min.cells = 3, min.genes = 200, project = "SC33")
SC33@cell.names <- paste("SC33", SC33@cell.names, sep = "_")
colnames(x = SC33@raw.data) <- paste("SC33", colnames(x = SC33@raw.data), sep = "_")
rownames(x = SC33@meta.data) <- paste("SC33", rownames(x = SC33@meta.data), sep = "_")
SC14.data <- Read10X(data.dir = "~/Box/Asbestos_SC/03_Analysis_V2/00_Asbestos/01_data/SC14/filtered_gene_bc_matrices/mm10/")
lung <- AddSamples(object = SC33, new.data = SC14.data, add.cell.id = "SC14")
SC15.data <- Read10X(data.dir = "~/Box/Asbestos_SC/03_Analysis_V2/00_Asbestos/01_data/SC15/filtered_gene_bc_matrices/mm10/")
lung <- AddSamples(object = lung, new.data = SC15.data, add.cell.id = "SC15")
In the beginning, we skip filtering and keep all cells, irrespecive of %mito and number of genes.
lung <- NormalizeData(object = lung, normalization.method = "LogNormalize", scale.factor = 10000) #Log normalize
lung <- FindVariableGenes(object = lung, mean.function = ExpMean, dispersion.function = LogVMR) #Find variable genes
length(x = lung@var.genes) #958
## [1] 958
hv.genes <- head(rownames(lung@hvg.info), 958)
mito.genes <- grep(pattern = "^mt-", x = rownames(x = lung@data), value = TRUE)
percent.mito <- Matrix::colSums(lung@raw.data[mito.genes, ])/Matrix::colSums(lung@raw.data)
lung <- AddMetaData(object = lung, metadata = percent.mito, col.name = "percent.mito")
lung <- ScaleData(object = lung, display.progress = T,vars.to.regress = c("nUMI"))
We examined our scree plot and manually inspected principal compenents for meaningful genes and clusters based on size. This led to the selection of the first 75 principal components as this is where we noted a decline in the quality of clusters being formed.
## [1] "PC1"
## [1] "Ramp2" "Cldn5" "Cdh5" "Egfl7" "Tmem100"
## [1] ""
## [1] "Rpl17" "Rps18" "Cd74" "H2-Aa" "Ptprcap"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "Ager" "Clic3" "Cldn18" "Aqp5" "Krt7"
## [1] ""
## [1] "Gngt2" "Wfdc17" "Rps18" "Car4" "Msrb1"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "Cd79a" "Cd79b" "Ly6d" "Ebf1" "Cd74"
## [1] ""
## [1] "Msrb1" "Lgals3" "Wfdc17" "Wfdc21" "Slpi"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "Igfbp5" "Aldh1a1" "Bgn" "Serping1" "Mgp"
## [1] ""
## [1] "Chil1" "Lamp3" "Slc34a2" "Sftpd" "Hc"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "Bgn" "Mgp" "Ogn" "Serping1" "Col3a1"
## [1] ""
## [1] "Ccdc153" "1110017D15Rik" "1700016K19Rik" "1700007K13Rik"
## [5] "Riiad1"
## [1] ""
## [1] ""
lung <- FindClusters(object = lung, reduction.type = "pca", dims.use = 1:75, resolution = 3, print.output = F, save.SNN = TRUE, force.recalc = T, n.start = 100)
## [1] "3 singletons identified. 49 final clusters."
lung <- RunTSNE(object = lung, dims.use = 1:75, do.fast = TRUE, check_duplicates = FALSE, perplexity = 30)
## Warning: package 'bindrcpp' was built under R version 3.4.4
##Validation of Clusters and Obtaining Marker Genes We finally used the top 50 genes in specific clusters to validate the identities of the clusters from our TSNE that stood out as they seemed to be split in unusual ways. Following this we found markers for each cluster and assigned identities to each cluster by manual expression of top marker genes.
TSNEPlot(object = lung, do.label = T)
lung <-ValidateSpecificClusters(lung, cluster1 = 0, cluster2 = 2, top.genes = 50) #B cells
## Warning: package 'caret' was built under R version 3.4.4
## [1] "Comparing cluster 0 and 2: Acc = 0.607916647434279"
## [1] "merge cluster 0 and 2"
TSNEPlot(object = lung, do.label = T)
lung <-ValidateSpecificClusters(lung, cluster1 = 1, cluster2 = 7, top.genes = 50) #AT2
## [1] "Comparing cluster 1 and 7: Acc = 0.998786513786514"
TSNEPlot(object = lung, do.label = T)
lung <-ValidateSpecificClusters(lung, cluster1 = 19, cluster2 = 7, top.genes = 50) #AT2
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## [1] "Comparing cluster 19 and 7: Acc = 0.997356254856255"
TSNEPlot(object = lung, do.label = T)
lung <-ValidateSpecificClusters(lung, cluster1 = 14, cluster2 = 7, top.genes = 50) #AT2
## [1] "Comparing cluster 14 and 7: Acc = 0.706970654534003"
## [1] "merge cluster 14 and 7"
TSNEPlot(object = lung, do.label = T)
lung <-ValidateSpecificClusters(lung, cluster1 = 12, cluster2 = 4, top.genes = 50) #AM
## [1] "Comparing cluster 12 and 4: Acc = 0.897936351068813"
## [1] "merge cluster 12 and 4"
TSNEPlot(object = lung, do.label = T)
lung.markers <- FindAllMarkers(object = lung, only.pos = TRUE, min.pct = 0.5, thresh.use = 0.5, max.cells.per.ident = 200)
#save(lung, file = "lung.clean.Robj")
C2 - B cells C3 - Classcial monocytes C4 - Alveolar macrophages C5 - Classcial monocytes C6 - T cells C7 - AT2 cells C8 - NK cells C9 - Endothelial cells (Cldn5, Kdr, Kitl, Igfbp7) C10 - Non-classical monocytes C11 - Neutrophils C13 - Endothelial cells (Cd93, Ptprb, Aqp1) C15 - T cells C16 - Peribronchial interstitial macrophages C17 - DC2 C18 - AT1 cells C20 - DC1 C21 - T cells C22 - T cells C23 - Ciliated cells C24 - Fibroblasts C25 - Neutrophils C26 - Club cells C27 - Ciliated cells C28 - pDC C29 - Doublets: Neutrophils and B cells C30 - Endothelial cells (C13) C31 - Endothelial cells (C13) C32 - Cycling AMs C33 - Mesothelial cells C34 - AT2 cells C35 - Smooth muscle cells C36 - AT2 cells C37 - Ccr7+ DC C38 - Doublets: AT2 and Neutrophils C39 - Doublets: B cells and monocytes C40 - Cell cycle/Damaged DC1 C41 - Cell cycle/Damaged T cells C42 - Mast cells/basophils C43 - Lymphatic progenitors C44 - Doublets: AM/AT2 C45 - Megakaryocytes C46 - Doublets: AM and Neutrophils C47 - Doublets: AM and NK cells
Composition of each cluster by library id:
After forming the initial Seurat Object consisting of these clusters and all three libraries we then proceeded to identify sub populations within clusters in an iterative process similar to what was previously done in the merged object. For the purposes of this markdown we show the process done for AT2 cells while other celltypes can be found in the full Rcode included (02_Asbestos_Celltype_Exploration.R).
AT2 <- SubsetData(lung, ident.use = c(7, 34, 36))
TSNEPlot(AT2, do.label = T)
TSNEPlot(AT2, do.label = T, group.by = "orig.ident")
AT2 <- FindVariableGenes(object = AT2, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
AT2 <- ScaleData(object = AT2, vars.to.regress = c("nUMI", "percent.mito"))
AT2 <- RunPCA(object = AT2, pc.genes = AT2@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5, pcs.compute = 50)
###Find Clusters and run TSNE:
AT2 <- FindClusters(object = AT2, reduction.type = "pca", dims.use = 1:15, resolution = 0.3, print.output = 0, save.SNN = TRUE, force.recalc = T)
PrintFindClustersParams(object = AT2)
## Parameters used in latest FindClusters calculation run on: 2018-06-06 16:19:36
## =============================================================================
## Resolution: 3
## -----------------------------------------------------------------------------
## Modularity Function Algorithm n.start n.iter
## 1 1 100 10
## -----------------------------------------------------------------------------
## Reduction used k.param k.scale prune.SNN
## pca 30 25 0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
## 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
AT2 <- RunTSNE(object = AT2, dims.use = 1:12, do.fast = TRUE, check_duplicates = FALSE)
###Resolve ambiguities by checking for known contaminating cell type markers (NK, B, Tcell markers). Here we found sub clusters 3, 4 and 5 contain B, monocytes and T/NK cell doublets, so we removed them before confirming the other sub populations we found within AT2s via reclustering.
FeaturePlot(AT2, features.plot = c("Ccr2", "Cd3e","Nkg7", "Cd79a"))
AT2.markers <- FindAllMarkers(object = AT2, only.pos = TRUE, min.pct = 0.5, thresh.use = 0.5, max.cells.per.ident = 200)
###Recheck variable genes and rescale and run PCA:
AT2 <- FindVariableGenes(object = AT2, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
length(x = AT2@var.genes)
AT2 <- ScaleData(object = AT2, vars.to.regress = c("nUMI"))
AT2 <- RunPCA(object = AT2, pc.genes = AT2@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5, pcs.compute = 20)
###Recluster and run TSNE:
AT2 <- FindClusters(object = AT2, reduction.type = "pca", dims.use = 1:8, resolution = 0.4, print.output = 0, save.SNN = TRUE, force.recalc = T)
PrintFindClustersParams(object = AT2)
## Parameters used in latest FindClusters calculation run on: 2018-06-06 16:19:36
## =============================================================================
## Resolution: 3
## -----------------------------------------------------------------------------
## Modularity Function Algorithm n.start n.iter
## 1 1 100 10
## -----------------------------------------------------------------------------
## Reduction used k.param k.scale prune.SNN
## pca 30 25 0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
## 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
AT2 <- RunTSNE(object = AT2, dims.use = 1:8, do.fast = TRUE, check_duplicates = FALSE, perplexity = 20)
###Find marker genes and view heatmap of top 10 genes in each subcluster:
AT2.markers <- FindAllMarkers(object = AT2, only.pos = TRUE, min.pct = 0.5, thresh.use = 0.25, test.use = "wilcox")
write.csv(AT2.markers, "AT2.markers_PC1_8_res0.4wilcox.csv")
###Vizualize the library origin of cells across the subclusters